## Introduction
In this project we use the H01L patent file, which covers semiconductor devices and electric solid-state devices. The definition of CPC class H01L is available at https://www.uspto.gov/web/patents/classification/cpc/html/cpc-H01L.html



In [1]:
import re, timeit
from collections import Counter
from nltk.tokenize import MWETokenizer
from nltk.util import Trie

class FreqMWETokenizer(MWETokenizer):
    """MWETokenizer variant that counts multi-word expressions instead of merging them.

    Every known expression is counted wherever it occurs, so nested and
    overlapping expressions each contribute to the result.
    """

    def __init__(self, mwes=None, separator="_"):
        super().__init__(mwes, separator)

    def freqs(self, text):
        """Count the occurrences of every known multi-word expression in ``text``.

        :param text: A list containing tokenized text
        :type text: list(str)
        :return: A ``Counter`` (dict subclass) keyed by the expression, with its
            words joined by the separator
        :rtype: collections.Counter
        :Example:
            tokenizer = FreqMWETokenizer([mw.split() for mw in ['multilayer ceramic', 'multilayer ceramic capacitor', 'ceramic capacitor']], separator=' ')
            tokenizer.freqs("Gimme that multilayer ceramic capacitor please!".split())
            # -> Counter({'multilayer ceramic': 1, 'multilayer ceramic capacitor': 1, 'ceramic capacitor': 1})
        """
        counts = Counter()
        total = len(text)

        for start in range(total):
            if text[start] not in self._mwes:
                # No expression begins with this token.
                continue

            node = self._mwes
            end = start
            # Walk the trie as far as the text allows; every node carrying the
            # LEAF marker on the way is a complete expression text[start:end].
            while end < total and text[end] in node:
                if Trie.LEAF in node:
                    counts[self._separator.join(text[start:end])] += 1
                node = node[text[end]]  # one step deeper into the trie
                end += 1

            # The node reached last has not been checked inside the loop.
            if Trie.LEAF in node:
                counts[self._separator.join(text[start:end])] += 1

        return counts
    
# Load the candidate multi-word terms (one per line). Terms containing 'the '
# and the generic term 'number of' are excluded. Blank lines are skipped too:
# they would split into an empty word list and register an empty expression.
with open("manyterms.lower.txt") as terms_file:
    mwes = [
        mw for mw in terms_file.read().strip().split('\n')
        if mw.strip() and 'the ' not in mw and mw != 'number of'
    ]
# The tokenizer expects each expression as a list of words.
splimwes = [mw.split() for mw in mwes]
tokenizer = FreqMWETokenizer(splimwes, separator=' ')

In [2]:
import pandas as pd
# Read the corpus, dropping empty lines and the '____' separator lines.
with open('H01L.txt') as patent_file:
    h01l = '\n'.join(li for li in patent_file.read().split('\n') if li and li[:4]!='____')
# Tokenise once and reuse the result for the sanity check and the lookup
# (the timing below therefore covers the lookup only, not the split).
h01l_words = h01l.split()
c,w = len(h01l), len(h01l_words)
print('sanity check:',c,'characters',w,'words',round(c/w,1),'chars per word')
print('looking for all potential multiwords in the patent text...')
start = timeit.default_timer()
counter = tokenizer.freqs(h01l_words)
secs = timeit.default_timer()-start
print('it took', secs, 'seconds,', secs/len(mwes),'per term')
# One row per term, most frequent first. Naming the column up front keeps the
# sort valid even when no term was found at all.
h01ldf = (
    pd.DataFrame.from_dict(counter, orient='index', columns=['freq'])
    .sort_values(by='freq', ascending=False)
    .rename_axis('term')
)
h01ldf

sanity check: 141679050 characters 22899763 words 6.2 chars per word
looking for all potential multiwords in the patent text...
it took 11.156254907999937 seconds, 1.5538803511611267e-05 per term


Unnamed: 0_level_0,freq
term,Unnamed: 1_level_1
solar cell,9476
semiconductor device,9073
display device,8602
gate electrode,7799
thin film,7709
...,...
local color,1
spectral purity,1
ultra-violet radiation,1
stress testing,1


In [8]:
# Persist the term frequencies as a tab-separated file (index column: 'term').
h01ldf.to_csv('h01ldf.tsv',sep='\t')

In [9]:
# Reload the saved frequencies so later cells can run without redoing the corpus scan.
h01ldf=pd.read_csv('h01ldf.tsv',sep='\t',index_col='term')
h01ldf

Unnamed: 0_level_0,freq
term,Unnamed: 1_level_1
solar cell,9476
semiconductor device,9073
display device,8602
gate electrode,7799
thin film,7709
...,...
local color,1
spectral purity,1
ultra-violet radiation,1
stress testing,1


In [3]:
import spacy, timeit
from spacy import displacy
from spacy.matcher import PhraseMatcher
from tqdm import tqdm
# Base English pipeline; the sense2vec component below is attached to it.
nlp = spacy.load("en_core_web_sm")

from sense2vec import Sense2Vec, Sense2VecComponent
# Load the sense2vec vectors from the local directory and append the component
# to the pipeline; the `_.in_s2v` extension used further down is presumably
# registered by this component (TODO confirm against the sense2vec version in use).
s2v = Sense2VecComponent(nlp.vocab).from_disk("s2v_reddit_2015_md/s2v_old")
nlp.add_pipe(s2v)

In [None]:
# Keep the terms whose full span is reported as present in sense2vec.
termsins2v = [t for t in tqdm(h01ldf.index) if nlp(t)[:]._.in_s2v]
print(len(termsins2v))

  6%|▌         | 583/10374 [00:02<00:40, 242.14it/s]

In [None]:
# Comma-separated preview of the first 250 terms found in sense2vec.
', '.join(termsins2v[:250])

In [None]:
!python3 -m prodigy sense2vec.teach patents s2v_reddit_2015_md/s2v_old --seeds "solar cell, display device, thin film, light source, carbon atoms, electronic device, solar cells, image sensor, control unit, single crystal, color filter, power generation, signal processing, heat sink, electric field, electronic component, thermal conductivity, room temperature, particle size, first light, field effect, electrical connection, optical system, energy level, thermal expansion, visible light, power line, electric power, surface layer, flow rate, raw material, red light, supply line, green light, high frequency, magnetic field, control device, flash memory, binding site, melting point, power source, fine particles, electric charge, vertical direction, rare earth, single bond, second source, electric current, crystal structure, current density, inert gas, surface area, high voltage, carbon atom, heat pipe, ultraviolet light, pressure chamber, current source, carbon nanotubes, pressure sensor, horizontal direction, upper limit, electronic devices, optical sensor, control line, composite material, work function, light sources, aspect ratio, low temperature, solar panel, solar power, solid state, direct current, carbon fibers, image processing, straight line, reduced pressure, display screen, light reflection, prior art, grain size, electrical signal, power generator, electromagnetic wave, weight ratio, transmission line, contact area, boiling point, heat source, measurement system, crystal growth, flat surface, heat radiation, screen printing, electrical conductivity, aluminum oxide, power consumption, low power, power converter, touch panel, top plate, excited state, phase change, computing device, front end, light sensor, electric charges, double bond, heat exchanger, data processing, wireless communication, adhesive tape, mesh network, raw materials, injection molding, first memory, integrated circuits, energy levels, power steering, magnetic flux, electric power 
steering, flat panel, low voltage, ground plane, management system, ultraviolet rays, mobile terminal, mechanical strength, vacuum pump, electromagnetic radiation, diffraction pattern, signal level, external force, potential difference, transparent material, volume ratio, electric potential, thermal radiation, voltage source, lower arm, cooling water, ceramic material, hot plate, pure water, circuit boards, measurement device, coordinate system, exhaust gas, metal wire, light energy, random access, voltage drop, surface tension, blue color, light beam, red color, low level, electric energy, pulse width, physical properties, mass ratio, solid solution, image quality, chemical formula, optical properties, electrical resistance, field strength, absorption spectrum, storage device, color temperature, distilled water, acetic acid, temperature control, magnetic sensor, electrostatic discharge, ambient temperature, phase shift, photovoltaic cells, storage tank, line width, small arm, thermal energy, high quality, electrical contacts, solar energy, inner ring, power density, thermal transfer, molecular structure, working fluid, electrical energy, computer program, second sound, magnetic material, optical fiber, electrical power, voltage difference, side chain, secondary battery, memory system, heat spreader, threshold value, hydrochloric acid, major axis, high density, output device, liquid phase, polyester resin, battery pack, energy storage, horizontal plane, supply lines, low-pass filter, thermal imaging, inner diameter, silicone rubber, memory chip, high resolution, second gap, atmospheric pressure, power point, focal length, thermal stress, vapor pressure, storage medium, oxidizing agent, organic solvents, head unit, notch filter, first position, second stage, power supply unit, data storage, electrical component, high resistance, power generators, magnesium oxide, periodic table, production process, solder joint, exhaust system, image plane, frequency response, 
static discharge, air pressure, deionized water, power lines, building material, natural light"

In [None]:
!python3 -m prodigy sense2vec.to-patterns patents en_core_web_sm TECH --output-file termsh01lpatterns.jsonl

In [None]:
import re
# Strip the dot from "fig." / "figs." (any case) so that the sentence splitter
# used later does not cut on it.
fig = re.compile(r'(figs?)\.',re.I)
# Chunks of the file are separated by two blank lines; each chunk is treated as one patent.
with open('H01L.txt') as patent_file:
    h01lpatents = [fig.sub(r'\1', pat) for pat in patent_file.read().split('\n\n\n')]
print(len(h01lpatents))

In [None]:
import random
# Rough sentence splitter: newline, full stop or semicolon.
sentsplit = re.compile('[\n.;]')
# Sentences of the first 12 patents; fragments of 25 characters or fewer and
# '____' separator lines are dropped.
h01lsents = [li.strip() for pat in h01lpatents[:12] for li in sentsplit.split(pat) if len(li)>25 and '____' not in li]
random.shuffle(h01lsents)
print(len(h01lsents))
# Write through a context manager so the file is flushed and closed before
# the annotation tool reads it.
with open('h01lsents.3000.txt','w') as sents_file:
    sents_file.write('\n'.join(h01lsents))
h01lsents[:5]

In [None]:
!python3 -m prodigy ner.manual annotatedpatents en_core_web_sm h01lsents.3000.txt --loader txt --patterns termsh01lpatterns.jsonl --label TECH

In [None]:
!python3 -m prodigy ner.teach annotatedpatents en_core_web_sm h01lsents.3000.txt --loader txt --patterns termsh01lpatterns.jsonl --label TECH


In [None]:
!python3 -m spacy download en_vectors_web_lg

In [None]:
!python3 -m prodigy train ner annotatedpatents en_vectors_web_lg --init-tok2vec ./tok2vec_cd8_model289.bin --output ./tmp_model --eval-split 0.2


In [None]:
!python3 -m prodigy train-curve ner annotatedpatents en_vectors_web_lg --init-tok2vec ./tok2vec_cd8_model289.bin  --eval-split 0.2


In [None]:
!python3 -m prodigy ner.correct annotatedpatents_correct ./tmp_model h01lsents.3000.txt --loader txt --label TECH --exclude annotatedpatents

In [None]:
!python3 -m prodigy train ner annotatedpatents,annotatedpatents_correct en_vectors_web_lg --init-tok2vec ./tok2vec_cd8_model289.bin --output ./tech_model --eval-split 0.2 --n-iter 20


In [None]:
!python3 -m prodigy train-curve ner annotatedpatents,annotatedpatents_correct en_vectors_web_lg --init-tok2vec ./tok2vec_cd8_model289.bin  --eval-split 0.2 


In [None]:
# Sentences of every patent after the first 12, filtered the same way as the
# annotated sample (longer than 25 characters, no '____' separator).
h01lsents_rest = [li.strip() for pat in h01lpatents[12:] for li in sentsplit.split(pat) if len(li)>25 and '____' not in li]
random.shuffle(h01lsents_rest)
print(len(h01lsents_rest))
# Context manager guarantees the file is flushed and closed.
with open('h01lsents.rest.txt','w') as rest_file:
    rest_file.write('\n'.join(h01lsents_rest))
h01lsents_rest[:5]

In [4]:
SPACY_MODEL = "./tech_model"      # path to spaCy model with entity recognizer
# Rebinds `nlp`: from this cell on it refers to the model loaded from
# SPACY_MODEL, no longer to the en_core_web_sm pipeline loaded earlier.
nlp = spacy.load(SPACY_MODEL)


In [None]:
# Highlight the TECH entities of one patent.
colors = {"TECH": "linear-gradient(90deg, #fc9ce7, blue)"}
options = {"ents": ["TECH"], "colors": colors}
# `doc` was never defined before this cell, so it failed on a fresh kernel.
# NOTE(review): the first patent is used as the example document; swap in
# another text if a different one was intended.
doc = nlp(h01lpatents[0])
displacy.render(doc, style="ent", options=options)

In [None]:
def get_entity(term, n_docs=12):
    """Return the first recognised entity whose text equals ``term``.

    The first ``n_docs`` patents of ``h01lpatents`` are parsed with ``nlp`` in
    order and their entities compared to ``term``.

    :param term: exact entity text to look for
    :param n_docs: number of leading patents to search (default 12); fewer
        are searched when the corpus is shorter
    :return: the matching entity span, or ``-1`` when no entity matches
    """
    # Slicing (instead of indexing 0..n_docs-1) tolerates a short corpus.
    for patent in h01lpatents[:n_docs]:
        for ent in nlp(patent).ents:
            if ent.text == term:
                return ent
    return -1

In [None]:
def get_similarity_vector(ent, n_docs=12):
    """Rank the entities of the first ``n_docs`` patents by similarity to ``ent``.

    :param ent: object exposing ``similarity(other)``, e.g. a span returned
        by ``get_entity``
    :param n_docs: number of leading patents to scan (default 12); fewer are
        scanned when the corpus is shorter
    :return: list of ``(entity, similarity)`` pairs, most similar first
    """
    # Slicing (instead of indexing 0..n_docs-1) tolerates a short corpus.
    pairs = [
        (other, ent.similarity(other))
        for patent in h01lpatents[:n_docs]
        for other in nlp(patent).ents
    ]
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs


In [None]:
# Example lookup; get_entity returns -1 when 'LAN port' is not among the recognised entities.
unknown_entity = get_entity('LAN port')


In [None]:
# (entity, similarity) pairs for the example entity, most similar first.
similarity_vector = get_similarity_vector(unknown_entity)

In [None]:
# Print the most similar entity that has a Wikipedia page; the generator is
# lazy, so lookups stop at the first hit.
first_hit = next(
    ((ent, val) for ent, val in similarity_vector if in_wikipedia(ent.text) != ''),
    None,
)
if first_hit is not None:
    print(first_hit[0], first_hit[1])


In [None]:
def get_similar_wiki_page(term):
    """Find a Wikipedia page for ``term`` or for its most similar known entity.

    :param term: entity text to resolve
    :return: ``(matched_text, link, score)`` where ``score`` is ``1`` for a
        page on ``term`` itself and the similarity value for a page on a
        similar entity. When nothing can be resolved the result is
        ``(term, '', 0)``.
    """
    link = in_wikipedia(term)
    if link != '':
        return term, link, 1

    entity = get_entity(term)
    # get_entity signals "not found" with the integer -1, which has no
    # similarity(); an isinstance check avoids comparing a span with an int.
    if isinstance(entity, int):
        return term, '', 0

    for ent, val in get_similarity_vector(entity):
        link = in_wikipedia(ent.text)
        if link != '':
            return ent.text, link, val

    # No similar entity has a page either: explicit result instead of None.
    return term, '', 0
                

In [None]:
# Example: resolve a term to a Wikipedia page, directly or via a similar entity.
get_similar_wiki_page('LAN port')

In [None]:
# Distinct entity texts recognised in the first 12 patents.
old_entities = set()
for i in range(12):
    doc = nlp(h01lpatents[i])
    old_entities |= {span.text for span in doc.ents}
    

In [None]:
# Distinct entity texts recognised in every patent from index 12 onwards.
new_entities = set()
for i in range(12, len(h01lpatents)):
    doc = nlp(h01lpatents[i])
    new_entities |= {span.text for span in doc.ents}
    

In [None]:
# Entities found in the later patents that never occur in the first 12.
dif_entities = new_entities.difference(old_entities)

In [None]:
# Number of entities that occur only in the later patents.
print('Dif', len(dif_entities))

In [None]:
# Sizes of both entity sets, then their union and their overlap.
print(len(old_entities))
print(len(new_entities))
entities = new_entities | old_entities
intersection_entities = new_entities & old_entities
print(len(intersection_entities))

In [None]:
import pandas as pd
# One row per entity, with membership flags for each entity set.
df = pd.DataFrame({'annotated_terms': list(entities)}).assign(
    old_data=lambda frame: frame['annotated_terms'].isin(old_entities),
    new_data=lambda frame: frame['annotated_terms'].isin(new_entities),
    dif_data=lambda frame: frame['annotated_terms'].isin(dif_entities),
)
# Terms that only occur in the later patents.
dif_terms = df.loc[df['dif_data'], 'annotated_terms'].values
dif_terms

In [None]:
# Take the leading entities in set iteration order (not a random sample).
# NOTE(review): the loop stops once the counter exceeds 1000, i.e. after 1001
# entities have been collected - confirm whether 1000 was intended.
i = 0
sampled_entities = []
for i, term in enumerate(entities, start=1):
    sampled_entities.append(term)
    if i > 1000:
        break

In [None]:
# Collect (and print) the entities of the first 12 patents that have no
# Wikipedia page under their own name.
wiki_terms = {}
unknown_terms = []
for term in old_entities:
    if in_wikipedia(term) != '':
        continue
    unknown_terms.append(term)
    print(term)
    

In [None]:
# Number of entities without a Wikipedia page of their own.
print(len(unknown_terms))

In [9]:
from os import system

def in_wikipedia(term, timeout=10):
    """Return the English Wikipedia URL for ``term``, or ``''`` if there is none.

    Spaces are replaced by underscores. The title is tried as given and then,
    if that fails, in lower case. The check is an HTTP HEAD request made from
    Python; the term is percent-encoded and never reaches a shell, so entity
    text containing ``;``, ``&``, quotes or ``*`` cannot run or delete anything.

    :param term: page title to look up (converted with ``str``)
    :param timeout: seconds to wait for each request
    :return: the URL that answered successfully (percent-encoded, identical to
        the plain URL for titles made of letters, digits and underscores), or
        ``''`` when the page is missing or the lookup failed
    """
    import urllib.error
    import urllib.parse
    import urllib.request

    def _probe(title):
        """Return (url, status) with status 'found', 'missing' or 'error'."""
        url = 'https://en.wikipedia.org/wiki/%s' % urllib.parse.quote(title)
        request = urllib.request.Request(
            url,
            method='HEAD',  # existence check only, no page download
            headers={'User-Agent': 'h01l-term-linker/0.1 (research notebook)'},
        )
        try:
            with urllib.request.urlopen(request, timeout=timeout):
                return url, 'found'
        except urllib.error.HTTPError as err:
            # 404/410: no such page; 400: title rejected by the server.
            return url, ('missing' if err.code in (400, 404, 410) else 'error')
        except OSError:
            # URLError, timeouts and other network failures.
            return url, 'error'

    title = str(term).replace(' ', '_')
    if not title.strip('_'):
        # An empty title would resolve to the site's front page.
        return ''

    url, status = _probe(title)
    if status != 'found' and title != title.lower():
        title = title.lower()
        url, status = _probe(title)

    if status == 'found':
        return url
    if status == 'error':
        print('Error with: %s' % title)
    return ''

# Smoke checks (need network access): three existing pages, then a title that
# should not exist and therefore yield the empty string.
print(in_wikipedia('Diode'))
print(in_wikipedia('JBL'))
print(in_wikipedia('Enterprise content management'))
print(in_wikipedia('nah naaaah') == '')

https://en.wikipedia.org/wiki/Diode
https://en.wikipedia.org/wiki/JBL
https://en.wikipedia.org/wiki/Enterprise_content_management
True


In [None]:
# List the entities that contain a full stop.
# NOTE(review): the original statement was truncated to `prin`; printing the
# term is the assumed intent.
for term in new_entities:
    if '.' in term:
        print(term)

In [None]:
# Try to resolve the first ten page-less terms through similar entities.
for term in unknown_terms[:10]:
    print(term,get_similar_wiki_page(term),'\n')

In [8]:
def get_all_ngrams(term):
    """List every proper word n-gram of ``term``, longest first.

    ``term`` is split on single spaces. N-grams of ``len(words) - 1`` words
    come first, down to single words; within one size they are ordered left
    to right. The full term itself is not included.
    """
    tokens = term.split(' ')
    total = len(tokens)
    return [
        ' '.join(tokens[start:start + size])
        for size in range(total - 1, 0, -1)
        for start in range(total - size + 1)
    ]
        

def get_best_similar_wiki_page(term, min_similarity=0.65):
    """Resolve ``term`` to the best available Wikipedia page.

    Strategies, in order:

    1. a page for the term itself, with everything but ASCII letters and
       spaces removed for the lookup (score ``1``);
    2. the most similar known entity that has a page, when its similarity is
       above ``min_similarity`` (score: the similarity);
    3. the longest word n-gram of the term that has a page (score ``-1``);
    4. the most similar entity with a page, whatever its similarity;
    5. ``(term, '', 0)`` when nothing was found.

    :param term: entity text to resolve
    :param min_similarity: similarity above which a similar entity is
        preferred to an n-gram of the term
    :return: ``(matched_text, link, score)``; always a 3-tuple
    """
    clean_term = re.sub('[^a-zA-Z ]', '', term).strip()
    # A term without letters cleans to '' and must not be looked up as-is.
    link = in_wikipedia(clean_term) if clean_term else ''
    if link != '':
        return term, link, 1

    entity = get_entity(term)
    # get_entity signals "not found" with the integer -1, which cannot be
    # passed on to the similarity ranking.
    ranked = [] if isinstance(entity, int) else get_similarity_vector(entity)

    # Most similar entity that actually has a page.
    best = None
    for ent, val in ranked:
        ent_link = in_wikipedia(ent.text)
        if ent_link != '':
            best = (ent.text, ent_link, val)
            break

    if best is not None and best[2] > min_similarity:
        return best

    # Each n-gram is judged by its own lookup result, and that lookup is kept
    # apart from the similar entity's link.
    for word in get_all_ngrams(term):
        word_link = in_wikipedia(word)
        if word_link != '':
            return word, word_link, -1

    if best is not None:
        return best
    return term, '', 0
                    
                

In [14]:
def gen_html(text, ents_link):
    """Render ``text`` as a small HTML page with known entities turned into links.

    Entities are matched as plain substrings, scanning left to right. Where
    several entities start at the same position the longest one wins, and
    matches never overlap. Text before, between and after the matches is kept
    in full, and an entity at the very end of the text is linked as well.

    :param text: raw text to annotate
    :param ents_link: mapping from entity text to the URL it should link to;
        entries with an empty entity or an empty link produce no anchor
    :return: the complete HTML document as a string; text and URLs are
        HTML-escaped
    """
    import html
    import re

    header = '<!DOCTYPE html><html><head><title>Page Title</title></head><body>'
    footer = '</body></html>'

    linkable = {ent: link for ent, link in ents_link.items() if ent and link}
    if not linkable:
        return header + html.escape(text, quote=False) + footer

    # Alternatives are tried in order, so listing longer entities first makes
    # the regex pick the longest entity starting at each position.
    by_length = sorted(linkable, key=len, reverse=True)
    pattern = re.compile('|'.join(re.escape(ent) for ent in by_length))

    parts = []
    pos = 0
    for match in pattern.finditer(text):
        entity = match.group(0)
        parts.append(html.escape(text[pos:match.start()], quote=False))
        parts.append('<a href="%s">%s</a>' % (
            html.escape(linkable[entity], quote=True),
            html.escape(entity, quote=False),
        ))
        pos = match.end()
    # Keep whatever follows the last entity.
    parts.append(html.escape(text[pos:], quote=False))

    return header + ''.join(parts) + footer

text = 'Un Terminal mobile est un appareil portable permettant le traitement et léchange de données. Cela inclut les smartphones, les tablettes tactiles, ainsi que les appareils de communication avec un central de dispatching utilisés dans les véhicules professionnels (par exemple : les voitures de police, les taxis, les coursiers, les flottes de camion, les flottes de pêche, les services militaires de logistique, les services durgence, etc.).'
# Hand-written entity -> link mapping used to try gen_html on the sample text.
ents_link = {'dispatching':'https://fr.wiktionary.org/wiki/dispatching',
             'urgence':'https://fr.wikipedia.org/wiki/M%C3%A9decine_d%27urgence',
             'tablettes tactiles':'https://fr.wikipedia.org/wiki/Tablette_tactile'}
# The context manager closes the file even if rendering or writing fails.
with open('annotations.html', 'w') as f:
    f.write(gen_html(text, ents_link))

In [6]:
def get_list_entities_links(text):
    """Map each entity recognised in ``text`` to a Wikipedia link.

    ``text`` is parsed with ``nlp`` and every distinct entity text is resolved
    with ``get_best_similar_wiki_page``.

    :param text: raw text to analyse
    :return: dict ``{entity_text: link}``; entities for which no link could
        be found are left out
    """
    result = {}
    for entity in {ent.text for ent in nlp(text).ents}:
        match = get_best_similar_wiki_page(entity)
        if not match:
            # The resolver found nothing at all for this entity.
            continue
        link = match[1]
        if link:
            result[entity] = link
    return result

        
        
    

In [15]:
text = 'A heteroarylene ring  may comprise at least 1 to 3 heteroatoms preferably the lithium quinolate complex has the formula III, IV or V:whereinA1 to A6 are same or independently selected from CH, CR, N and O As VTE source a point source for organic materials is used as supplied by Kurt J A mobile terminal comprising: a housing comprising a rear case provided in a rear surface and a side case provided in a lateral surface Term "comprising" does not exclude other elements or steps, the term "a" or "an" does not exclude a plurality The invention will now be further illustrated by means of the following example, which is not intended to limit the scope in any manner The metal may be selected from an alkali, alkaline earth or rare earth metal In addition, this has the added advantage that a deburring process, that is, the removal of burrs or raised edges is not required 2424KCaScTiVCrMnFeCoNiCuZnGaGeAsSeBrKr0 In particular loss of transparency may lead to lower production of electricityIn other processes, where the chip is to be transferred, the chip, after pick-up, is transferred by the transfer head to a carrier tape, or to a chip flip unit (when the chip is to be flipped) In the inverse coordination complex, each ligand L may be coordinated to two different metal atoms of the first coordination sphere The at least one organic polymer may suitably comprise one or more selected from an epoxy-based polymer, a poly(meth)acrylate, a polyester or vinyl ester, or a polyurethane The magnetic flux guide layer (which for brevity may be referred to as the flux guide layer) may, as it is arranged below the magnetic tunnel junction pillars, include a set of portions, each portion being arranged under a respective one of the set of MTJ pillars'
# Resolve every entity of the sample text to a link and write the annotated page.
dict_entities = get_list_entities_links(text)
# The context manager closes the file even if rendering or writing fails.
with open('annotations.html', 'w') as f:
    f.write(gen_html(text, dict_entities))